7.) Simulation by R

a.)

x <- seq(1, 100)
x
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100

b.)

w <- 2 + 0.5*x
w
##   [1]  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5  8.0  8.5  9.0  9.5
##  [16] 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 16.5 17.0
##  [31] 17.5 18.0 18.5 19.0 19.5 20.0 20.5 21.0 21.5 22.0 22.5 23.0 23.5 24.0 24.5
##  [46] 25.0 25.5 26.0 26.5 27.0 27.5 28.0 28.5 29.0 29.5 30.0 30.5 31.0 31.5 32.0
##  [61] 32.5 33.0 33.5 34.0 34.5 35.0 35.5 36.0 36.5 37.0 37.5 38.0 38.5 39.0 39.5
##  [76] 40.0 40.5 41.0 41.5 42.0 42.5 43.0 43.5 44.0 44.5 45.0 45.5 46.0 46.5 47.0
##  [91] 47.5 48.0 48.5 49.0 49.5 50.0 50.5 51.0 51.5 52.0

c.)

set.seed(123)
myNormDist <- rnorm(100, sd = 5)
cat("The mean is ", mean(myNormDist), "\n")
## The mean is  0.4520295
cat("The standard deviation is ", sd(myNormDist), "\n")
## The standard deviation is  4.564079
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
Norm.df <- data.frame(myNormDist)

ggplotly(ggplot(data = Norm.df, aes(x = myNormDist)) + geom_histogram(bins = 15, fill = "#8a5789"))

We notice that even though we create a normal distribution with defined mean and standard deviation, we observe a differing sample mean and standard deviation. Observing our histogram, we also can see that our distribution is not perfectly symmetric.

d.)

y <- myNormDist + w
y
##   [1] -0.3023782  1.8491126 11.2935416  4.3525420  5.1464387 13.5753249
##   [7]  7.8045810 -0.3253062  3.0657357  4.7716901 13.6204090  9.7990691
##  [13] 10.5038573  9.5534136  6.7207943 18.9345657 12.9892524  1.1669142
##  [19] 15.0067795  9.6360430  7.1608815 11.9101254  8.3699778 10.3555439
##  [25] 11.3748037  6.5665334 19.6889352 16.7668656 10.8093153 23.2690746
##  [31] 19.6323211 16.5246426 22.9756283 23.3906674 23.6079054 23.4432013
##  [37] 23.2695883 20.6904414 19.9701867 20.0976450 19.0264651 21.9604136
##  [43] 17.1730182 34.8447798 30.5398100 19.3844571 23.4855758 23.6667232
##  [49] 30.3998256 26.5831547 28.7665926 27.8572662 28.2856477 35.8430114
##  [55] 28.3711451 37.5823530 22.7562360 33.9230687 32.1192712 33.0797078
##  [61] 34.3981974 30.4883827 31.8339631 28.9071231 29.1410439 36.5176432
##  [67] 37.7410489 36.2650211 41.1113373 47.2504234 35.0448442 26.4541556
##  [73] 43.5286926 35.4539962 36.0599569 45.1278568 39.0761350 34.8964114
##  [79] 42.4065174 41.3055432 42.5288209 44.9264020 41.6466998 47.2218827
##  [85] 43.3975672 46.6589098 50.9841951 48.1759075 44.8703421 52.7440381
##  [91] 52.4675193 50.7419848 49.6936587 45.8604696 56.3032622 46.9987021
##  [97] 61.4366650 58.6630531 50.3214982 46.8678955

e.)

Norm.df["y"] <- y
names(Norm.df) <- c("x", "y")
head(Norm.df)
##            x          y
## 1 -2.8023782 -0.3023782
## 2 -1.1508874  1.8491126
## 3  7.7935416 11.2935416
## 4  0.3525420  4.3525420
## 5  0.6464387  5.1464387
## 6  8.5753249 13.5753249
# Base size for labels is 11
# https://ggplot2.tidyverse.org/reference/ggtheme.html
ggplotly(ggplot(data = Norm.df, aes(x = x, y = y)) + 
           geom_point(color = "#7eb6ff") + 
           ggtitle("Y Versus X") + 
           theme_grey(base_size = 11*1.5))

f.)

Beta1 <-
  sum((Norm.df[, 1] - mean(Norm.df[, 1])) * (Norm.df[, 2] - mean(Norm.df[, 2]))) / sum((Norm.df[, 1] - mean(Norm.df[, 1])) ^2)
Beta0 <- mean(Norm.df[,2]) - Beta1 * mean(Norm.df[, 1])

# Create column for predictions
Norm.df["Ypred"] <- Beta0 + Beta1 * Norm.df["x"]


ggplotly(ggplot(data = Norm.df, aes(x = x, y = y)) + 
           geom_point(color = "#7eb6ff") + 
            geom_line(data = Norm.df, aes(x = x, y = Ypred)) + 
           ggtitle("Y Versus X") + 
           theme_grey(base_size = 11*1.5))
Norm.df["ei"] <- Norm.df["y"] - Norm.df["Ypred"]

ggplotly(ggplot(data = Norm.df, aes(x = x, y = ei)) + 
           geom_point(color = "#75a876") + 
           ggtitle("Residuals Versus X") + 
           theme_grey(base_size = 11*1.5))
cat("The MSE is", sum(Norm.df["ei"]^2) / (nrow(Norm.df) - 2))
## The MSE is 211.2099

h.)

figList <- c()
for (i in rep(1,5)) {
  set.seed(i)
  xLoop <- rnorm(100, sd = 5)
  yLoop <- myNormDist + w
  
  Beta1Loop <-
  sum((xLoop - mean(xLoop)) * (yLoop - mean(yLoop))) / sum((xLoop - mean(xLoop))^2)
  Beta0Loop <- mean(yLoop) - Beta1 * mean(xLoop)
  

  Loop.df <- data.frame(xLoop, yLoop, Beta0Loop + Beta1Loop * xLoop)
  names(Loop.df) <- c("x", "y", "Ypred")
  
  ggplotly(ggplot(data = Loop.df, aes(x = x, y = y)) + 
           geom_point(color = "#7eb6ff") + 
            geom_line(data = Loop.df, aes(x = x, y = Ypred)) + 
           ggtitle("Y Versus X") + 
           theme_grey(base_size = 11*1.5))
  #figList <- c(figList, Loop.df)
}